// Program to perform new NSW simulation
capture program drop nswdgp
program define nswdgp
syntax[,k(real 1) replicas(integer 1) post]
/* THIS PART HAS TO BE RUN ONLY ONCE. AFTER THAT USE THE DTA
use "$library/simulation/DGP_NSW_data_all.dta", clear
// STEP 0. Keep observational data. Only for black. Then rescale nominal variables
       drop if group<2
       keep if black==1
       foreach y in 74 75 78{
         replace earn`y' = earn`y'/1000
         replace unempl`y' = earn`y'<0.000001
       }

// STEP 1. Get population coefficients for selection and outcome eqs
       local lin= "one age educ married unempl74 unempl75 earn74 earn75 dropout"
       local int= "unempl74_unempl75 earn74sq earn75sq earn74_earn75"

	   gen one = 1
	   gen byte unempl74_unempl75 = unempl74*unempl75
       gen double earn75sq  = (earn75/1)^2
       gen double earn74sq  = (earn74/1)^2
       gen double earn74_earn75= (earn74/1)*(earn75/1)
       gen byte dropout = educ<12

	   logit T `lin' `int', nocons
       mat gamma = e(b)'

	   reg earn78 `lin' `int' if T==1, nocons
       mat beta1 = e(b)'
	   local s1=e(rmse)
       predict double mu1X
       reg earn78 `lin' `int' if T==0, nocons
       mat beta0 = e(b)'
	   local s0=e(rmse)
       predict double mu0X
       gen dif = mu1X-mu0X
       sum dif if T==1
       local TOT = r(mean)
	   drop dif

	   mata: gamma= st_matrix("gamma")
	   mata: beta0= st_matrix("beta0")
	   mata: beta1= st_matrix("beta1")
       mata: S = (`s0', `s1')


// STEP 2. Calculate moments in MATA and save them into a Stata file
      local disc_X = "married unempl74 unempl75"
      local cont_X = "age educ  earn74  earn75"
	  
      egen cells = group(`disc_X' T)

     // Loop over cells
        sum cells
        local CELLS = r(max)
        mata: V   = J(4,4,.)
        mata: MEAN= .
        mata: MIN = .
        mata: MAX = .
        mata: SD  = .

        forvalues c = 1(1)`CELLS'{
            qui tabstat `cont_X' if cells==`c', statistics(mean min max sd) save
            matrix H = r(StatTotal)'
			mata:  H = st_matrix("H")
            mata: MEAN= MEAN\H[.,1]
            mata: MIN = MIN\H[.,2]
            mata: MAX = MAX\H[.,3]
            mata: SD  = SD \H[.,4]

		    qui count if cells==`c'
			if r(N)==1{
			   mata: V= V\J(4,4,.)
			}
 		    if r(N)>1{
			   qui correl `cont_X' if cells==`c', cov
               matrix H = r(C)
			   mata:  H = st_matrix("H")
			   mata: V= V\H
			}
         }
		 
// STEP 3. Organize everything into a stata dataset at the cell level
     mata: MEAN= MEAN[2..rows(MEAN),1]
     mata: MIN = MIN[2..rows(MIN),1]
     mata: MAX = MAX[2..rows(MAX),1]
     mata: SD  = SD[2..rows(SD),1]
     mata: V   = V[5..rows(V),.]
	 mata: v1=V[.,1] 
	 mata: v2=V[.,2] 
	 mata: v3=V[.,3] 
	 mata: v4=V[.,4] 
 
	 collapse (mean) `disc_X' T (sum) one, by(cells)
	 ren one N
	 expand 4
	 bysort cells: gen o = _n
	 gen varname = ""
	 local i = 1
	 foreach var in `cont_X'{
	    replace varname = "`var'" if o==`i'
		local i = `i'+1
	 }
     getmata MEAN MIN MAX SD v1 v2 v3 v4
	 ren v1 v_age 
	 ren v2 v_educ  
	 ren v3 v_earn74  
	 ren v4 v_earn75  
	 
	foreach var in age educ  earn74  earn75{
	  replace v_`var' = 0.0001 if v_`var'==0
	}
save "nswmoms.dta", replace	 
*/

// STEP 4. Draw new data {X,T} based on the moments calculated in the previous steps.
use "nswmoms.dta", replace
replace N = N*`replicas'
qui{
sum cells
local CELLS = r(max)
mata: R = J(1,4,.)
mata: F = J(1,2,.)
mata: M = J(1,4,.)
forvalues c = 1(1)`CELLS'{
	putmata MEAN if cells==`c', replace
	putmata V =  (v_age v_educ v_earn74 v_earn75) if cells==`c', replace
	putmata N = N if cells==`c', replace
	putmata SD=SD if cells==`c', replace
	mata{
		N = mean(N)
		R=R\(MEAN':+invnormal(uniform(N,cols(V)))*cholesky(V)')
        //in the cases in which the cholesky matrix is undefined we draw only for age and educ
		K = (1\1\0\0)
		G =select(V,K)
		G =select(G,K')
		B =select(MEAN,K)
		tol=0.00001
		SIGMA=cholesky(G+J(rows(G),cols(G),tol))'
		F = F\(B':+invnormal(uniform(N,cols(G)))*SIGMA)
		//also save the means to be used in the case of cells with one obs.
		M = M\MEAN':*J(N,4,1)
    }
}

mata{
    M= M[2..rows(M),.]  //has the means
    R= R[2..rows(R),.]  //has the normal draws for {age, educ, earn74, earn75} in the cases it could be drawn
	F= F[2..rows(F),.]  //has the normal draws for {age, educ} in the cases it could be drawn
	F= F,J(rows(F),2,0)
	R= editmissing(R,0) //if a draw for {age, educ, earn74, earn75} is available, use it.
	O= R[.,1]:==0
	R= R+O:*F           //if not, use a draw for {age, educ} 
	O= R[.,1]:==.
	R= editmissing(R,0)
	R= R+O:*M           //if not, use the mean
}
// STEP 5. Move the stuff from MATA into STATA. Impose mins and maxs. Discretize vars
	keep  married unempl74 unempl75 N MIN MAX cells varname
	reshape wide MIN MAX , j(varname) i( cells married unempl74 unempl75 N) string

	expand N
	sort cells
	getmata (age educ earn74 earn75) = R
	order  cells married unempl74 unempl75 age educ earn74 earn75
	foreach var in age educ earn74 earn75{
	 replace `var' = MIN`var' if `var'<MIN`var'
	 replace `var' = MAX`var' if `var'>MAX`var'
	}
	drop MIN* MAX* N cells

	// Discretize age and education   
	replace age = round(age,1)    
	replace educ= round(educ,1)

// STEP 6. Compute selection equation
	mat gamma=[ 1.2035393, -.08755592, .1389975, -1.2600823, 2.4363385,-2.8215169, -.01612219, -.42033397, .9784195  , 1.7345521, .00295543, .00987434, -.00458513]'

	//Define variables we need
       gen byte unempl74_unempl75 = unempl74*unempl75
       gen double earn75sq  = (earn75/1)^2
       gen double earn74sq  = (earn74/1)^2
       gen double earn74_earn75= (earn74/1)*(earn75/1)
       gen byte dropout = educ<12
	   gen one = 1

     // Selection equation on simulated data
       local lin= "one age educ married unempl74 unempl75 earn74 earn75 dropout"
       local int= "unempl74_unempl75 earn74sq earn75sq earn74_earn75"
	   putmata X = (`lin' `int'), replace

       //2. u, Tstar equation and T
	   mata: gamma= st_matrix("gamma")
	   mata: xB = X*gamma
	   getmata xB=xB
       gen U = runiform()
       gen logistic = log(U/(1-U))
       gen Tstar = xB - `k'*logistic
	   gen pX = exp(xB/`k')/(1+exp(xB/`k'))
       gen T = (Tstar>=0)

// STEP 7. Treatment effects and outcome functions
	mat beta0=[ 4.0405884, -.07082189, .11395986, 1.3001038,-.73488037, 2.4357905, .64041985 , .21564211 , -1.9191203, -1.7917938, .00782677, .02267225, -.02937706]'
	mat beta1=[ 1.0554232,  .09422149, .44930771,-.05285189, 5.9115728, 1.3466187, -.65657639, -.29606088, -1.6452337, -8.6567537, -.00301478, -.06871041, .15483496]'
	local s0 = 8.325797145   
	local s1 = 7.595139668

    // On simulated data. beta comes from real data, Xs are simulated
	   mata: beta0= st_matrix("beta0")
	   mata: xbeta0 = X*beta0
	   getmata mu0X=xbeta0
	   mata: beta1= st_matrix("beta1")
	   mata: xbeta1 = X*beta1
	   getmata mu1X=xbeta1
       gen sigma1 = `s1'
       gen sigma0 = `s0'
       gen Y1=mu1X+`s1'*invnorm(uniform())
       gen Y0=mu0X+`s0'*invnorm(uniform())
       gen Y=T*Y1+(1-T)*Y0
	   ren Y y1
}  // closes qui		
end

*nswdgp, k(1) replicas(1) post

